{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "read_dir='E:/文档/数据集/自适应实践数据集/matmat-2016-10/'\n",
    "save_dir='../data/mat2016/'\n",
    "\n",
    "answer_data=pd.read_csv(read_dir+'answers.csv',index_col=0)\n",
    "item_data=pd.read_csv(read_dir+'items.csv')\n",
    "skill_data=pd.read_csv(read_dir+'skills.csv')\n",
    "\n",
    "prob_count_limit=15 #每个学生做过的习题下限"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "record=answer_data.loc[:,['student','item','correct']].drop_duplicates(subset=['student','item'],keep='first')\\\n",
    "    .dropna(axis=0,how='any')\n",
    "record.columns=['user_id','item_id','score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "有效的学生数： 6866\n"
     ]
    }
   ],
   "source": [
    "# 统计每个学生做了多少道题\n",
    "problem_counter=record.groupby(by='user_id').count()\n",
    "\n",
    "filtered_stu_boundary=prob_count_limit #学生做的题超过15道才算数\n",
    "\n",
    "filtered_stu_id=problem_counter[problem_counter['item_id']>filtered_stu_boundary].index.to_numpy()\n",
    "\n",
    "print('有效的学生数：',len(filtered_stu_id))\n",
    "\n",
    "record=record.set_index('user_id').loc[filtered_stu_id,:].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_data=item_data.loc[:,['id','skill','skill_lvl_1','skill_lvl_2','skill_lvl_3']]\\\n",
    "    .drop_duplicates(keep='first')\n",
    "item_data=item_data.melt(id_vars='id',value_vars=['skill','skill_lvl_1','skill_lvl_2','skill_lvl_3'])\\\n",
    "    .loc[:,['id','value']].drop_duplicates(keep='first').dropna(axis=0,how='any').astype('int')\n",
    "item_data.columns=['item_id','knowledge_code']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "skill=skill_data.loc[:,['id','parent']].fillna(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_unique=np.unique(record['item_id'])\n",
    "know_unique=np.unique(item_data['knowledge_code'])\n",
    "stu_unique=np.unique(record['user_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "stu_old_new=dict(zip(stu_unique,range(1,len(stu_unique)+1)))\n",
    "item_old_new=dict(zip(item_unique,range(1,len(item_unique)+1)))\n",
    "know_old_new=dict(zip(know_unique,range(1,len(know_unique)+1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done\n"
     ]
    }
   ],
   "source": [
    "record['user_id']=record['user_id'].map(stu_old_new)\n",
    "record['item_id']=record['item_id'].map(item_old_new)\n",
    "\n",
    "item_data['item_id']=item_data['item_id'].map(item_old_new)\n",
    "item_data['knowledge_code']=item_data['knowledge_code'].map(know_old_new)\n",
    "\n",
    "skill['id']=skill['id'].map(know_old_new)\n",
    "skill['parent']=skill['parent'].map(know_old_new)\n",
    "skill=skill.dropna(axis=0,how='any').astype('int')\n",
    "skill.columns=['knowledge_code','parent']\n",
    "\n",
    "print('Done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_open_df=item_data.set_index('item_id')\n",
    "\n",
    "item_df=pd.DataFrame(columns=['item_id','knowledge_code'],index=range(1,len(item_unique)+1))\n",
    "for item_i in range(1,len(item_unique)+1):\n",
    "    item_df.loc[item_i,'item_id']=item_i\n",
    "    item_df.loc[item_i,'knowledge_code']=np.array(item_open_df.loc[item_i,['knowledge_code']]).reshape(-1).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_df.to_csv(save_dir+'item.csv',index=False)\n",
    "record.to_csv(save_dir+'record.csv',index=False)\n",
    "skill.to_csv(save_dir+'skill.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "学习者数： 6866\n",
      "习题数： 1847\n",
      "知识点数： 445\n"
     ]
    }
   ],
   "source": [
    "print('学习者数：',len(stu_unique))\n",
    "print('习题数：',len(item_unique))\n",
    "print('知识点数：',len(know_unique))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "527a93331b4b1a8345148922acc34427fb7591433d63b66d32040b6fbbc6d593"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('pytorch': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
